Este notebook queda dividido en tres secciones:
# Herramientas para el manejo de dataframes y lectura de datos
import pandas as pd
from pandas import read_csv
# Validación cruzada y división del conjunto de datos
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
# Guardado y carga de los modelos entrenados
import joblib as joblib
# Gráficos
import matplotlib.pyplot as plt
# Para la aplicación de los test estadísticos
import scikit_posthocs as scp
import scipy.stats as stats
import operator
import numpy as np
import statsmodels as st
import pingouin as pg
# Métricas de error
import math
import sklearn.metrics as sm
# Gráficas
import plotly.express as px
from flask import Flask
import flask
from plotly.subplots import make_subplots
import plotly.graph_objects as go
En esta primera sección visualizamos de forma gráfica y numérica los errores cometidos en las predicciones tanto en training (en cada uno de los pliegues) como en test en la predicción t. De esta forma podremos apoyarnos en los resultados para detectar modelos que pueden estar haciendo overfitting.
# Load the per-fold RMSE errors measured on the training folds and drop
# the fold-index column, which is redundant with the row position.
RMSE_train = pd.read_csv('../Datos_preprocesados/RMSE_errores_train.csv',
                         encoding='latin-1', sep=',', na_values=['NaN', 'NaT'])
RMSE_train = RMSE_train.drop(columns="Pliegues")
RMSE_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.010250 | 0.012253 | 0.040040 | 0.021895 | 0.010777 | 0.012175 | 0.015468 | 0.017608 | 0.009779 | 0.011421 | 0.009565 | 0.011302 |
| 1 | 0.003465 | 0.005035 | 0.006011 | 0.008859 | 0.005831 | 0.004163 | 0.006113 | 0.004916 | 0.003236 | 0.005702 | 0.005437 | 0.004665 |
| 2 | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.007964 | 0.006088 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 3 | 0.005817 | 0.005213 | 0.008027 | 0.008158 | 0.006893 | 0.007117 | 0.012659 | 0.018517 | 0.005418 | 0.005386 | 0.006962 | 0.007347 |
| 4 | 0.019228 | 0.018612 | 0.015113 | 0.017661 | 0.014720 | 0.014294 | 0.018877 | 0.022049 | 0.018109 | 0.019953 | 0.015303 | 0.015526 |
| 5 | 0.097717 | 0.099564 | 0.030118 | 0.027917 | 0.031447 | 0.029454 | 0.031626 | 0.026445 | 0.094637 | 0.099538 | 0.031492 | 0.028937 |
| 6 | 0.054043 | 0.054168 | 0.047621 | 0.046455 | 0.048174 | 0.044944 | 0.050317 | 0.046399 | 0.049422 | 0.054665 | 0.048253 | 0.045141 |
| 7 | 0.117732 | 0.113723 | 0.064359 | 0.061369 | 0.063967 | 0.061773 | 0.066249 | 0.059950 | 0.118328 | 0.118405 | 0.064465 | 0.062454 |
| 8 | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
| 9 | 0.060135 | 0.064386 | 0.029916 | 0.038327 | 0.029938 | 0.036199 | 0.029868 | 0.036866 | 0.048429 | 0.046525 | 0.029645 | 0.035670 |
Resumen con los datos estadísticos más importantes de la tabla de RMSE sobre los diez pliegues
RMSE_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.049563 | 0.051771 | 0.032960 | 0.032719 | 0.029975 | 0.030343 | 0.032815 | 0.033264 | 0.047135 | 0.050586 | 0.029860 | 0.030430 |
| std | 0.048881 | 0.051414 | 0.027178 | 0.027624 | 0.028047 | 0.028913 | 0.027536 | 0.027363 | 0.048149 | 0.052089 | 0.028119 | 0.028922 |
| min | 0.002359 | 0.002950 | 0.002599 | 0.004666 | 0.002282 | 0.002253 | 0.006113 | 0.004916 | 0.002058 | 0.001663 | 0.002220 | 0.002200 |
| 25% | 0.006925 | 0.006973 | 0.009799 | 0.011060 | 0.007864 | 0.008381 | 0.013362 | 0.017836 | 0.006509 | 0.007132 | 0.007613 | 0.008336 |
| 50% | 0.036635 | 0.036390 | 0.030017 | 0.024906 | 0.022329 | 0.021874 | 0.024372 | 0.024247 | 0.033269 | 0.033239 | 0.022474 | 0.022231 |
| 75% | 0.088322 | 0.090770 | 0.045726 | 0.044423 | 0.043993 | 0.042758 | 0.045644 | 0.044016 | 0.083333 | 0.088320 | 0.044063 | 0.042774 |
| max | 0.124888 | 0.141807 | 0.085793 | 0.091884 | 0.085716 | 0.091056 | 0.089009 | 0.093796 | 0.121939 | 0.142605 | 0.085260 | 0.091053 |
Diagrama de cajas interactivo para la tabla RMSE
# Interactive box plot: one box per model column of the RMSE table.
fig = px.box(RMSE_train, height=400, width=700, y=RMSE_train.columns)
fig.show()
Media y desviación típica del error para cada uno de los modelos.
# Per-model mean and standard deviation of the training RMSE, shown side by side.
datos = {'Media': RMSE_train.mean(), 'Desviacion tipica': RMSE_train.std()}
pd.DataFrame(datos)
| Media | Desviacion tipica | |
|---|---|---|
| RF_lag3 | 0.049563 | 0.048881 |
| RF_lag5 | 0.051771 | 0.051414 |
| LR_lag3 | 0.032960 | 0.027178 |
| LR_lag5 | 0.032719 | 0.027624 |
| Lasso_lag3 | 0.029975 | 0.028047 |
| Lasso_lag5 | 0.030343 | 0.028913 |
| SVR_lag3 | 0.032815 | 0.027536 |
| SVR_lag5 | 0.033264 | 0.027363 |
| GBR_lag3 | 0.047135 | 0.048149 |
| GBR_lag5 | 0.050586 | 0.052089 |
| EN_lag3 | 0.029860 | 0.028119 |
| EN_lag5 | 0.030430 | 0.028922 |
# Load the per-fold MAE errors measured on the training folds and drop
# the fold-index column, which is redundant with the row position.
MAE_train = pd.read_csv('../Datos_preprocesados/MAE_errores_train.csv',
                        encoding='latin-1', sep=',', na_values=['NaN', 'NaT'])
MAE_train = MAE_train.drop(columns="Pliegues")
MAE_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.008462 | 0.011392 | 0.032479 | 0.014992 | 0.007227 | 0.007082 | 0.012623 | 0.017494 | 0.007774 | 0.009879 | 0.006476 | 0.005981 |
| 1 | 0.003400 | 0.005021 | 0.004186 | 0.003734 | 0.004024 | 0.002734 | 0.005366 | 0.003984 | 0.003021 | 0.005115 | 0.003658 | 0.003443 |
| 2 | 0.001714 | 0.002173 | 0.001866 | 0.003266 | 0.001329 | 0.001685 | 0.005990 | 0.004277 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| 3 | 0.002247 | 0.002934 | 0.003994 | 0.002763 | 0.002256 | 0.002159 | 0.007597 | 0.010134 | 0.001828 | 0.001652 | 0.001959 | 0.002001 |
| 4 | 0.012167 | 0.013229 | 0.011721 | 0.007150 | 0.011192 | 0.012083 | 0.014557 | 0.018390 | 0.012484 | 0.013561 | 0.012501 | 0.012333 |
| 5 | 0.084448 | 0.087158 | 0.016943 | 0.016096 | 0.017917 | 0.016959 | 0.021310 | 0.012787 | 0.082231 | 0.087985 | 0.020494 | 0.016918 |
| 6 | 0.033824 | 0.033456 | 0.026278 | 0.026127 | 0.025823 | 0.027261 | 0.034723 | 0.028557 | 0.022534 | 0.032382 | 0.026455 | 0.025546 |
| 7 | 0.065309 | 0.056947 | 0.036142 | 0.032730 | 0.035894 | 0.033170 | 0.040139 | 0.031767 | 0.074404 | 0.061051 | 0.037323 | 0.032217 |
| 8 | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
| 9 | 0.043867 | 0.055397 | 0.020134 | 0.024806 | 0.020085 | 0.023185 | 0.018272 | 0.020647 | 0.030652 | 0.040988 | 0.019380 | 0.022019 |
MAE_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.035397 | 0.038609 | 0.020654 | 0.017455 | 0.018104 | 0.016971 | 0.022021 | 0.019194 | 0.033625 | 0.036978 | 0.018395 | 0.016443 |
| std | 0.036337 | 0.040109 | 0.016466 | 0.013860 | 0.017191 | 0.014463 | 0.017656 | 0.012644 | 0.037485 | 0.039825 | 0.017260 | 0.013971 |
| min | 0.001714 | 0.002173 | 0.001866 | 0.002763 | 0.001329 | 0.001685 | 0.005366 | 0.003984 | 0.001440 | 0.001243 | 0.001306 | 0.001632 |
| 25% | 0.004666 | 0.006614 | 0.006070 | 0.004588 | 0.004825 | 0.003821 | 0.008853 | 0.010797 | 0.004210 | 0.006306 | 0.004363 | 0.004077 |
| 50% | 0.022996 | 0.023343 | 0.018539 | 0.015544 | 0.014555 | 0.014521 | 0.016415 | 0.017942 | 0.017509 | 0.022971 | 0.015940 | 0.014626 |
| 75% | 0.059948 | 0.056559 | 0.030929 | 0.025797 | 0.024388 | 0.026242 | 0.031370 | 0.026579 | 0.063466 | 0.056035 | 0.024965 | 0.024665 |
| max | 0.098532 | 0.118379 | 0.052798 | 0.042889 | 0.055291 | 0.043395 | 0.059637 | 0.043899 | 0.099885 | 0.115925 | 0.054401 | 0.042336 |
# Interactive box plot: one box per model column of the MAE table.
fig = px.box(MAE_train, height=400, width=700, y=MAE_train.columns)
fig.show()
# Per-model mean and standard deviation of the training MAE, shown side by side.
datos = {'Media': MAE_train.mean(), 'Desviacion tipica': MAE_train.std()}
pd.DataFrame(datos)
| Media | Desviacion tipica | |
|---|---|---|
| RF_lag3 | 0.035397 | 0.036337 |
| RF_lag5 | 0.038609 | 0.040109 |
| LR_lag3 | 0.020654 | 0.016466 |
| LR_lag5 | 0.017455 | 0.013860 |
| Lasso_lag3 | 0.018104 | 0.017191 |
| Lasso_lag5 | 0.016971 | 0.014463 |
| SVR_lag3 | 0.022021 | 0.017656 |
| SVR_lag5 | 0.019194 | 0.012644 |
| GBR_lag3 | 0.033625 | 0.037485 |
| GBR_lag5 | 0.036978 | 0.039825 |
| EN_lag3 | 0.018395 | 0.017260 |
| EN_lag5 | 0.016443 | 0.013971 |
# Load the per-fold correlation coefficients on the training folds and drop
# the fold-index column, which is redundant with the row position.
CC_train = pd.read_csv('../Datos_preprocesados/CC_train.csv',
                       encoding='latin-1', sep=',', na_values=['NaN', 'NaT'])
CC_train = CC_train.drop(columns="Pliegues")
CC_train
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.660 | 0.851 | 0.396 | 0.465 | 0.431 | 0.433 | 0.485 | 0.465 | 0.592 | 0.875 | 0.458 | 0.450 |
| 1 | 0.725 | 0.647 | 0.065 | -0.042 | 0.160 | 0.047 | 0.245 | -0.009 | 0.269 | 0.734 | 0.202 | 0.109 |
| 2 | 0.866 | 0.907 | 0.467 | 0.556 | 0.486 | 0.571 | 0.491 | 0.553 | 0.646 | 0.895 | 0.497 | 0.571 |
| 3 | 0.926 | 0.934 | 0.337 | 0.433 | 0.320 | 0.445 | 0.336 | 0.535 | 0.871 | 0.942 | 0.315 | 0.442 |
| 4 | 0.863 | 0.889 | 0.568 | 0.601 | 0.561 | 0.602 | 0.689 | 0.748 | 0.786 | 0.981 | 0.571 | 0.598 |
| 5 | 0.928 | 0.932 | 0.793 | 0.803 | 0.796 | 0.803 | 0.824 | 0.826 | 0.938 | 0.985 | 0.796 | 0.802 |
| 6 | 0.837 | 0.910 | 0.517 | 0.583 | 0.508 | 0.600 | 0.477 | 0.690 | 0.909 | 0.976 | 0.501 | 0.598 |
| 7 | 0.943 | 0.944 | 0.888 | 0.913 | 0.887 | 0.910 | 0.860 | 0.873 | 0.966 | 0.981 | 0.884 | 0.907 |
| 8 | 0.933 | 0.948 | 0.864 | 0.885 | 0.862 | 0.879 | 0.817 | 0.811 | 0.989 | 0.981 | 0.861 | 0.879 |
| 9 | 0.804 | 0.782 | 0.620 | 0.586 | 0.609 | 0.591 | 0.621 | 0.638 | 0.844 | 0.968 | 0.608 | 0.587 |
CC_train.describe()
| RF_lag3 | RF_lag5 | LR_lag3 | LR_lag5 | Lasso_lag3 | Lasso_lag5 | SVR_lag3 | SVR_lag5 | GBR_lag3 | GBR_lag5 | EN_lag3 | EN_lag5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
| mean | 0.848500 | 0.874400 | 0.551500 | 0.578300 | 0.562000 | 0.588100 | 0.584500 | 0.613000 | 0.781000 | 0.931800 | 0.569300 | 0.594300 |
| std | 0.095342 | 0.094558 | 0.255903 | 0.274144 | 0.235375 | 0.252187 | 0.212598 | 0.257212 | 0.222787 | 0.079698 | 0.225801 | 0.236204 |
| min | 0.660000 | 0.647000 | 0.065000 | -0.042000 | 0.160000 | 0.047000 | 0.245000 | -0.009000 | 0.269000 | 0.734000 | 0.202000 | 0.109000 |
| 25% | 0.812250 | 0.860500 | 0.413750 | 0.487750 | 0.444750 | 0.476500 | 0.479000 | 0.539500 | 0.681000 | 0.906750 | 0.467750 | 0.480250 |
| 50% | 0.864500 | 0.908500 | 0.542500 | 0.584500 | 0.534500 | 0.595500 | 0.556000 | 0.664000 | 0.857500 | 0.972000 | 0.536000 | 0.592500 |
| 75% | 0.927500 | 0.933500 | 0.749750 | 0.752500 | 0.749250 | 0.752750 | 0.785000 | 0.795250 | 0.930750 | 0.981000 | 0.749000 | 0.751000 |
| max | 0.943000 | 0.948000 | 0.888000 | 0.913000 | 0.887000 | 0.910000 | 0.860000 | 0.873000 | 0.989000 | 0.985000 | 0.884000 | 0.907000 |
# Interactive box plot: one box per model column of the correlation table.
fig = px.box(CC_train, height=400, width=700, y=CC_train.columns)
fig.show()
# Per-model mean and standard deviation of the training correlation coefficient.
datos = {'Media': CC_train.mean(), 'Desviacion tipica': CC_train.std()}
pd.DataFrame(datos)
| Media | Desviacion tipica | |
|---|---|---|
| RF_lag3 | 0.8485 | 0.095342 |
| RF_lag5 | 0.8744 | 0.094558 |
| LR_lag3 | 0.5515 | 0.255903 |
| LR_lag5 | 0.5783 | 0.274144 |
| Lasso_lag3 | 0.5620 | 0.235375 |
| Lasso_lag5 | 0.5881 | 0.252187 |
| SVR_lag3 | 0.5845 | 0.212598 |
| SVR_lag5 | 0.6130 | 0.257212 |
| GBR_lag3 | 0.7810 | 0.222787 |
| GBR_lag5 | 0.9318 | 0.079698 |
| EN_lag3 | 0.5693 | 0.225801 |
| EN_lag5 | 0.5943 | 0.236204 |
Como hemos comentado, nos apoyaremos en las gráficas y consideraremos que están haciendo overfitting aquellos modelos para los que haya una diferencia de 0,20 o más entre los errores de las predicciones de train y de test.
# Read the scaled lag-3 and lag-5 predictor tables, indexed by date.
df_predictor_lag3_escalado = (
    read_csv('../Datos_preprocesados/predictor_lag3_escalado.csv',
             encoding='latin-1', sep=',', na_values=['NaN', 'NaT'])
    .set_index('Fecha')
)
df_predictor_lag5_escalado = (
    read_csv('../Datos_preprocesados/predictor_lag5_escalado.csv',
             encoding='latin-1', sep=',', na_values=['NaN', 'NaT'])
    .set_index('Fecha')
)
# Data set split.
# Chronological 80/20 hold-out: shuffle=False preserves the temporal order
# of the series, which also means random_state has no effect on the result.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(df_predictor_lag3_escalado.drop(['Incidentes'], axis=1),
df_predictor_lag3_escalado['Incidentes'], train_size = 0.8, test_size = 0.2,
random_state = 42, shuffle = False)
# Same chronological split for the lag-5 predictor table.
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(df_predictor_lag5_escalado.drop(['Incidentes'], axis=1),
df_predictor_lag5_escalado['Incidentes'], train_size = 0.8, test_size = 0.2,
random_state = 42, shuffle = False)
Cargamos una lista con todos los modelos entrenados para poder iterar después sobre ella.
# Load every trained model (two lag variants per algorithm).  The file order
# below is significant: it must stay aligned, index by index, with `nombres`.
_rutas_modelos = [
    'rf_lag3_escalado.pkl', 'rf_lag5_escalado.pkl',
    'regresion_lineal_lag3_escalado.pkl', 'regresion_lineal_lag5_escalado.pkl',
    'lasso_lag3_escalado.pkl', 'lasso_lag5_escalado.pkl',
    'svr_lag3_escalado.pkl', 'svr_lag5_escalado.pkl',
    'gbr_lag3_escalado.pkl', 'gbr_lag5_escalado.pkl',
    'en_lag3_escalado.pkl', 'en_lag5_escalado.pkl',
]
modelos = [joblib.load(ruta) for ruta in _rutas_modelos]
# Human-readable labels, aligned index-by-index with `modelos`.
nombres = ['RF Lag3', 'RF Lag5', 'LR Lag3', 'LR Lag5', 'Lasso Lag3', 'Lasso Lag5', 'SVR Lag3', 'SVR Lag5', 'GBR Lag3', 'GBR Lag5', 'EN Lag3', 'EN Lag5']
La siguiente función calcula las predicciones en el instante t sobre el conjunto de train y el conjunto de test, para cada uno de los modelos. De esta forma, generaremos para cada modelo dos gráficas unidas. La de la izquierda contendrá los datos reales de train junto con la predicción y la de la derecha contendrá los datos reales de test junto con la predicción. Además calcula el MAE y el RMSE generado por las predicciones, tanto para train como para test.
def pred_train_test(modelo, lag, nombre=None):
    """Plot train/test predictions of *modelo* and return its error metrics.

    Draws a two-panel figure: left, real vs predicted values on the training
    set; right, the same on the test set.  Then computes MAE and RMSE on both
    splits.

    Parameters
    ----------
    modelo : fitted estimator exposing a ``predict`` method.
    lag : int
        Either 3 or 5; selects the matching train/test split built earlier
        in the notebook (``X_train_3``/``X_train_5`` and friends).
    nombre : str, optional
        Label used in the figure title.  When omitted, falls back to the
        original behaviour of reading ``nombres[i]`` through the caller's
        global loop variable ``i`` (kept for backward compatibility).

    Returns
    -------
    tuple
        ``(mae_train, mae_test, rmse_train, rmse_test)``.

    Raises
    ------
    ValueError
        If *lag* is neither 3 nor 5 (previously this crashed later with an
        ``UnboundLocalError``).
    """
    # Select the train/test sets matching the model's lag.
    if lag == 3:
        X_train, y_train = X_train_3, y_train_3
        X_test, y_test = X_test_3, y_test_3
    elif lag == 5:
        X_train, y_train = X_train_5, y_train_5
        X_test, y_test = X_test_5, y_test_5
    else:
        raise ValueError(f"lag must be 3 or 5, got {lag!r}")
    if nombre is None:
        # Backward-compatible fallback: the original implementation read the
        # caller's global loop index `i` to label the figure.
        nombre = nombres[i]
    fig = make_subplots(rows=1, cols=2)
    # Predictions on the training and on the test set.
    y_pred_train_modelo = modelo.predict(X_train)
    y_pred_test_modelo = modelo.predict(X_test)
    # Left subplot: real train values vs the train prediction.
    fig.add_trace(go.Scatter(x = y_train.index, y = y_train, mode = 'lines', name = 'Train Real'), row = 1, col = 1)
    fig.add_trace(go.Scatter(x = y_train.index, y = y_pred_train_modelo, mode = 'lines', name = 'Pred Train'), row = 1, col = 1)
    # Right subplot: real test values vs the test prediction.
    fig.add_trace(go.Scatter(x = y_test.index, y = y_test, mode = 'lines', name = 'Test Real'), row = 1, col = 2)
    fig.add_trace(go.Scatter(x = y_test.index, y = y_pred_test_modelo, mode = 'lines', name = 'Pred test'), row = 1, col = 2)
    # Title the figure.  (The old `flask.Markup(fig)` call was dead code — its
    # result was discarded — and has been removed.)
    fig.update_layout(showlegend = True, title_text = 'Predicciones Train VS Test ' + str(nombre))
    fig.show("notebook")
    # MAE on the train and test predictions.
    mae_train = sm.mean_absolute_error(y_train, y_pred_train_modelo)
    mae_test = sm.mean_absolute_error(y_test, y_pred_test_modelo)
    # RMSE on the train and test predictions.
    rmse_train = math.sqrt(sm.mean_squared_error(y_train, y_pred_train_modelo))
    rmse_test = math.sqrt(sm.mean_squared_error(y_test, y_pred_test_modelo))
    return mae_train, mae_test, rmse_train, rmse_test
# Accumulators for the train/test prediction errors (MAE and RMSE) of each model.
maes_train, maes_test = [], []
rmses_train, rmses_test = [], []
# Iterate over the models: even indices are the lag-3 variants, odd the lag-5.
for i in range(len(modelos)):
    nombre = nombres[i]
    lag = 3 if i % 2 == 0 else 5
    # Predict, render the figures, and collect the four error metrics.
    mae_train, mae_test, rmse_train, rmse_test = pred_train_test(modelos[i], lag)
    maes_train.append(mae_train)
    maes_test.append(mae_test)
    rmses_train.append(rmse_train)
    rmses_test.append(rmse_test)